Wykorzystane biblioteki
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
library(knitr)
library(caret)
library(data.table)
library(tibble)
Wczytywanie danych
# Names of the CSV columns that are skipped while reading the file.
removable_columns <- c(
  "title", "pdb_code", "res_id", "chain_id",
  "local_res_atom_count",
  "local_res_atom_non_h_occupancy_sum",
  "local_res_atom_non_h_electron_occupancy_sum",
  "local_res_atom_C_count", "local_res_atom_N_count",
  "local_res_atom_O_count", "local_res_atom_S_count",
  "dict_atom_C_count", "dict_atom_N_count",
  "dict_atom_O_count", "dict_atom_S_count",
  "skeleton_data", "skeleton_cycle_4", "skeleton_diameter",
  "skeleton_cycle_6", "skeleton_cycle_7",
  "skeleton_closeness_006_008", "skeleton_closeness_002_004",
  "skeleton_cycle_3", "skeleton_avg_degree",
  "skeleton_closeness_004_006", "skeleton_closeness_010_012",
  "skeleton_closeness_012_014",
  "skeleton_edges", "skeleton_radius", "skeleton_cycle_8_plus",
  "skeleton_closeness_020_030", "skeleton_deg_5_plus",
  "skeleton_closeness_016_018", "skeleton_closeness_008_010",
  "skeleton_closeness_018_020", "skeleton_average_clustering",
  "skeleton_closeness_040_050", "skeleton_closeness_014_016",
  "skeleton_center", "skeleton_closeness_000_002",
  "skeleton_density", "skeleton_closeness_030_040",
  "skeleton_deg_4", "skeleton_deg_0", "skeleton_deg_1",
  "skeleton_deg_2", "skeleton_deg_3",
  "skeleton_graph_clique_number", "skeleton_nodes",
  "skeleton_cycles", "skeleton_cycle_5",
  "skeleton_closeness_050_plus", "skeleton_periphery",
  "fo_col", "fc_col", "weight_col",
  "grid_space", "solvent_radius", "solvent_opening_radius",
  "part_step_FoFc_std_min", "part_step_FoFc_std_max",
  "part_step_FoFc_std_step"
)

# Read at most the first 10000 data rows of the summary file, treating the
# first line as the header and leaving out the columns listed above.
data <- fread(
  "./all_summary.csv",
  header = TRUE,
  nrows = 10000,
  drop = removable_columns
)

# Report the number of rows and columns that were loaded.
dim(data)
## [1] 10000 350
Przetwarzanie brakujących danych
# Size of the table before rows with missing values are removed.
dim(data)
## [1] 10000 350
# Keep only the rows in which no column is NA.
data <- drop_na(data)
# Size of the table after the removal.
dim(data)
## [1] 8958 350
Usuwanie niepotrzebnych ligandów
# Values of res_name whose rows are excluded from the analysis.
# NOTE(review): "H20" is spelled with the digit zero; it may be a typo for
# "H2O" — confirm against the data before changing it.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
  "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP",
  "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU", "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)

# Drop every row whose res_name appears in the exclusion list.
data <- filter(data, !(res_name %in% deletable_res_name))

# Size of the table after the exclusion.
dim(data)
## [1] 8910 350
Podsumowanie danych
# Pick the three columns that are summarised below.
statistics <- select(
  data,
  res_name,
  blob_volume_coverage,
  blob_volume_coverage_second
)

# Render the per-column summary as a table.
statistics %>%
  summary() %>%
  kable()
| | res_name | blob_volume_coverage | blob_volume_coverage_second |
|:--|:--|:--|:--|
| | Length:8910 | Min. :0.02305 | Min. :0.00000 |
| | Class :character | 1st Qu.:0.50648 | 1st Qu.:0.00000 |
| | Mode :character | Median :0.72244 | Median :0.00000 |
| | NA | Mean :0.66784 | Mean :0.02067 |
| | NA | 3rd Qu.:0.86480 | 3rd Qu.:0.00000 |
| | NA | Max. :1.00000 | Max. :0.95385 |
# Show the table size again; the summary step above does not modify `data`.
print(dim(data))
## [1] 8910 350
50 najpopularniejszych ligandów
# Count rows per res_name, most frequent first, and keep the first 50 entries.
popular_ligands <- data %>%
  count(res_name, sort = TRUE) %>%
  slice(seq_len(50))

# Names of those most frequent res_name values, as a plain vector.
popular_names_vector <- popular_ligands[["res_name"]]

# Restrict the data set to rows whose res_name is among them.
data <- filter(data, res_name %in% popular_names_vector)

# Size of the table after the restriction.
dim(data)
## [1] 6239 350
Liczność najpopularniejszych ligandów według nazwy
# Bar chart of the counts in popular_ligands: bars are ordered by decreasing
# count, filled by count, and the x tick labels are rotated by 90 degrees.
plot_ligands <- ggplot(
  popular_ligands,
  aes(x = reorder(res_name, -n), y = n, fill = n)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Liczność ligandów według nazwy",
    x = "ligand",
    y = "liczność"
  ) +
  theme(axis.text.x = element_text(angle = 90))

# Convert the ggplot object to an interactive plotly widget.
ggplotly(plot_ligands)
Korelacja między zmiennymi
# List pairs of distinct numeric columns whose correlation is at least 0.9999.
# The correlation matrix is turned into long form (one row per ordered pair),
# so each unordered pair appears twice in the result.
data %>%
  select_if(is.numeric) %>%
  cor() %>%
  as.data.frame() %>%
  rownames_to_column(var = "rowname") %>%
  gather(key = "rowname2", value = "value", -rowname) %>%
  filter(rowname != rowname2 & value >= 0.9999)
## rowname rowname2 value
## 1 part_00_max local_max 1.0000000
## 2 part_01_max local_max 1.0000000
## 3 part_02_max local_max 1.0000000
## 4 part_00_max_over_std local_max_over_std 1.0000000
## 5 part_01_max_over_std local_max_over_std 1.0000000
## 6 part_02_max_over_std local_max_over_std 0.9999999
## 7 part_00_density_segments_count part_00_shape_segments_count 1.0000000
## 8 part_00_shape_segments_count part_00_density_segments_count 1.0000000
## 9 part_00_shape_M000 part_00_volume 1.0000000
## 10 part_00_density_M000 part_00_electrons 1.0000000
## 11 local_max part_00_max 1.0000000
## 12 part_01_max part_00_max 1.0000000
## 13 part_02_max part_00_max 1.0000000
## 14 local_max_over_std part_00_max_over_std 1.0000000
## 15 part_01_max_over_std part_00_max_over_std 1.0000000
## 16 part_02_max_over_std part_00_max_over_std 0.9999999
## 17 part_00_volume part_00_shape_M000 1.0000000
## 18 part_01_density_FL part_00_density_FL 0.9999229
## 19 part_01_density_I4 part_00_density_I4 0.9999031
## 20 part_00_electrons part_00_density_M000 1.0000000
## 21 part_01_density_segments_count part_01_shape_segments_count 1.0000000
## 22 part_01_shape_segments_count part_01_density_segments_count 1.0000000
## 23 part_01_shape_M000 part_01_volume 1.0000000
## 24 part_01_density_M000 part_01_electrons 1.0000000
## 25 local_max part_01_max 1.0000000
## 26 part_00_max part_01_max 1.0000000
## 27 part_02_max part_01_max 1.0000000
## 28 local_max_over_std part_01_max_over_std 1.0000000
## 29 part_00_max_over_std part_01_max_over_std 1.0000000
## 30 part_02_max_over_std part_01_max_over_std 0.9999999
## 31 part_01_volume part_01_shape_M000 1.0000000
## 32 part_00_density_FL part_01_density_FL 0.9999229
## 33 part_00_density_I4 part_01_density_I4 0.9999031
## 34 part_01_electrons part_01_density_M000 1.0000000
## 35 part_02_density_segments_count part_02_shape_segments_count 1.0000000
## 36 part_02_shape_segments_count part_02_density_segments_count 1.0000000
## 37 part_02_shape_M000 part_02_volume 1.0000000
## 38 part_02_density_M000 part_02_electrons 1.0000000
## 39 local_max part_02_max 1.0000000
## 40 part_00_max part_02_max 1.0000000
## 41 part_01_max part_02_max 1.0000000
## 42 local_max_over_std part_02_max_over_std 0.9999999
## 43 part_00_max_over_std part_02_max_over_std 0.9999999
## 44 part_01_max_over_std part_02_max_over_std 0.9999999
## 45 part_02_volume part_02_shape_M000 1.0000000
## 46 part_02_electrons part_02_density_M000 1.0000000
Rozkłady gęstościowe liczb
Atomów
# Density estimate of the local_res_atom_non_h_count column, drawn as a
# semi-transparent filled area without an outline.
plot_atom <- ggplot(data, aes(x = local_res_atom_non_h_count)) +
  geom_density(fill = "#00CECB", color = NA, alpha = 0.3) +
  labs(
    title = "Rozkład gęstościowy atomów",
    x = "liczność atomów",
    y = "gęstość"
  )

# Convert the ggplot object to an interactive plotly widget.
ggplotly(plot_atom)
Elektronów
# Density estimate of the local_res_atom_non_h_electron_sum column, drawn as a
# semi-transparent filled area without an outline.
plot_electron <- ggplot(data, aes(x = local_res_atom_non_h_electron_sum)) +
  geom_density(fill = "#FF5E5B", color = NA, alpha = 0.3) +
  labs(
    title = "Rozkład gęstościowy elektronów",
    x = "liczność elektronów",
    y = "gęstość"
  )

# Convert the ggplot object to an interactive plotly widget.
ggplotly(plot_electron)
Rozkład wartości kolumn part_01
#' Replace outliers in a numeric vector with NA
#'
#' A value counts as an outlier when it lies more than 1.5 times the
#' interquartile range below the first quartile or above the third quartile.
#' The result has the same length as the input, so the function can be applied
#' column by column without breaking row alignment; outliers are masked with
#' NA instead of being dropped.
#'
#' @param data A numeric vector.
#' @param na.rm Passed to `quantile()`; when `TRUE`, missing values are ignored
#'   while the quartiles are computed.
#' @param ... Further arguments forwarded to `quantile()` (for example `type`).
#' @return A vector of the same length as `data` with outliers set to `NA`.
#'   Values that were already `NA` stay `NA`.
remove_outliers <- function(data, na.rm = TRUE, ...) {
  quartiles <- quantile(data, probs = c(0.25, 0.75), na.rm = na.rm, ...)

  # Derive the fence width from the same quartiles that anchor the fences, so
  # that options forwarded through `...` apply to both consistently.
  fence <- 1.5 * (quartiles[[2]] - quartiles[[1]])
  lower_bound <- quartiles[[1]] - fence
  upper_bound <- quartiles[[2]] + fence

  # which() discards NA comparisons, so existing missing values never end up
  # in the assignment index.
  outlier_idx <- which(data < lower_bound | data > upper_bound)

  data_no_outliers <- data
  data_no_outliers[outlier_idx] <- NA
  data_no_outliers
}
# Keep only the columns whose name contains "part_01".
plot_part_data <- select(data, contains("part_01"))

# Size of the selected subset.
dim(plot_part_data)
## [1] 6239 106
# Apply remove_outliers() to every column and turn the combined result back
# into a data frame.
plot_part_data <- as.data.frame(sapply(plot_part_data, remove_outliers))

# Size after outlier masking.
dim(plot_part_data)
## [1] 6239 106
# Discard every row that contains an NA in any column.
plot_part_data <- drop_na(plot_part_data)

# Size after the rows with missing values were discarded.
dim(plot_part_data)
## [1] 3028 106
# Reshape to long form: one row per (column name, value) pair. All columns are
# gathered via everything() rather than a fixed index range, so the step keeps
# working when the number of selected columns changes.
plot_part_data <- plot_part_data %>%
  gather(part, value, everything())

# Horizontal box plot with one box per original column name.
plot_ly(
  plot_part_data,
  x = plot_part_data$value,
  y = plot_part_data$part,
  type = "box",
  height = 2000
)